#Mengimport Library yang Diperlukan
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset into the program.
# BUG FIX: the original lines carried stray "- " diff-marker prefixes (a
# SyntaxError), misspelled `uploaded`, and tried to read the CSV before it
# had been uploaded to the Colab runtime. Upload first, then read once.
from google.colab import files
uploaded = files.upload()

# Display the dataset; the CSV uses ';' as its field separator.
df_penerima = pd.read_csv('K-Nearest Neighbor (K-NN).csv', sep=";")
df_penerima

# Show the number of rows (samples) and columns (attributes).
df_penerima.shape

# List the attribute (column) names in the dataset.
df_penerima.columns

# Overall information about the dataset: dtypes, non-null counts, memory usage.
df_penerima.info()

# Summary statistics (count, mean, std, min/max, quartiles) per numeric
# column, transposed so each attribute appears as a row.
df_penerima.describe().transpose()

# Treat 0 as an out-of-range value in these four attribute columns and
# mark it as missing (NaN) so it can be handled as a missing value.
kolom_bermasalah = ['Jenis Kerusakan', 'Tingkat Risiko',
                    'Frekuensi Kerusakan', 'Bagian Kerusakan']
for kolom in kolom_bermasalah:
    df_penerima[kolom] = df_penerima[kolom].replace(0, np.nan)
df_penerima.head(60)

# Count missing values per column, largest first.
total = df_penerima.isnull().sum().sort_values(ascending=False)
print(total)

# Split into features x (columns 1 and 2) and target y (last column).
feature_idx = [1, 2]
x = df_penerima.iloc[:, feature_idx].values
y = df_penerima.iloc[:, -1].values

print(x)

print(y)

# Split the dataset into training (90%) and testing (10%) partitions.
# random_state=42 keeps the split reproducible across runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=42)

print(x_train)

print(x_test)

len(x_train)

len(x_test)

print(y_train)

# Feature scaling: standardize the features so distances between samples
# stay comparable (important for a distance-based model like K-NN).
# The scaler is fit on the training data only, then reused on the test
# data so no test-set information leaks into the transform.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Print the scaled training features.
print(x_train)

# Print the scaled test features.
print(x_test)

# Train a K-Nearest Neighbors classifier and predict on the test set.
from sklearn.neighbors import KNeighborsClassifier

k = 20
# Fit the model on the scaled training data (fit returns the estimator).
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train, y_train)

# Predict the test set and peek at the first five predictions.
yhat = knn.predict(x_test)
yhat[0:5]

# Evaluate and validate the predictions with a confusion matrix.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

matriks = confusion_matrix(y_test, yhat, labels=knn.classes_)
ConfusionMatrixDisplay(confusion_matrix=matriks,
                       display_labels=knn.classes_).plot()

plt.show()

# Precision, recall, f1-score and accuracy of the classifier.
# BUG FIX: the original called `metrics.classification_report` BEFORE
# `from sklearn import metrics`, raising a NameError when run top-to-bottom;
# the import now precedes its first use.
from sklearn import metrics

print(metrics.classification_report(y_test, yhat, digits=3))

# Accuracy on the training set vs. the test set (gap hints at overfitting).
print("Train set Accuracy:", metrics.accuracy_score(y_train, knn.predict(x_train)))
print("Test set Accuracy:", metrics.accuracy_score(y_test, yhat))

# Accuracy expressed as a percentage (%d truncates the fractional part).
from sklearn.metrics import accuracy_score
akurasi = accuracy_score(y_test, yhat)
print("Tingkat Akurasi :%d persen" % (akurasi * 100))

# Evaluate the model for every k from 1 to 49 and record per-k accuracy.
# FIX: removed the dead `ConfustionMx = [];` (misspelled, never used) and
# the bare `mean_acc` expression inside the loop, which was a no-op.
Ks = 50
mean_acc = np.zeros((Ks - 1))
std_acc = np.zeros((Ks - 1))
for n in range(1, Ks):
    # Train a model with n neighbors and predict on the test set.
    neigh = KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train)
    yhat = neigh.predict(x_test)
    mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
    # Standard error of the per-sample accuracy at this k.
    std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

# Plot test accuracy against k with a shaded uncertainty band.
plt.plot(range(1, Ks), mean_acc, 'g')
plt.fill_between(range(1, Ks), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
# FIX: the band drawn above is +/- 1 std, but the legend claimed 3xstd;
# also corrected the 'Nabors' typo in the x-axis label.
plt.legend(('Accuracy', '+/- 1xstd'))
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()


